Compute summary statistics.
# Load forecasts and pre-computed county-level score cards from disk
load(file = "predictions_cards.rda")
load(file = "score_cards_county.rda")
# load(file = "score_cards_state.rda")
# Restrict all score cards to the (ahead, location, target date) combinations
# that CMU-TimeSeries forecasted, so every forecaster is evaluated on a
# common task set
score_cards_county_cmu = score_cards_county %>%
filter(forecaster == "CMU-TimeSeries") %>%
distinct(ahead, geo_value, target_end_date)
score_cards_county = semi_join(score_cards_county, score_cards_county_cmu)
# Count non-missing WIS values per forecaster and ahead over the last 4 weeks
recent_score_by_frcstr = score_cards_county %>%
filter(target_end_date > today() - 28) %>%
group_by(forecaster, ahead) %>%
summarize(num = sum(!is.na(wis))) %>%
pivot_wider(names_from = ahead,
names_prefix = "num_",
values_from = num)
recent_score_by_frcstr = recent_score_by_frcstr %>%
mutate(num_total = num_1 + num_2 + num_3 + num_4) %>%
arrange(desc(num_total))
print(recent_score_by_frcstr, n = Inf)
## # A tibble: 7 x 6
## # Groups: forecaster [7]
## forecaster num_1 num_2 num_3 num_4 num_total
## <chr> <int> <int> <int> <int> <int>
## 1 CMU-TimeSeries 6400 6400 6400 6400 25600
## 2 COVIDhub-baseline 6400 6400 6400 6400 25600
## 3 COVIDhub-ensemble 6400 6400 6400 6400 25600
## 4 UCLA-SuEIR 6400 6400 6400 6400 25600
## 5 LANL-GrowthRate 6368 6368 6368 6336 25440
## 6 UMass-MechBayes 6248 6160 6192 6240 24840
## 7 JHU_IDD-CovidSP 6400 4800 4800 4800 20800
# Keep only "active" forecasters: those with at least 80% as many recent
# scored forecasts as CMU-TimeSeries
cmu_forecasts = recent_score_by_frcstr[recent_score_by_frcstr$forecaster == "CMU-TimeSeries",]$num_total
active_forecasters = recent_score_by_frcstr[recent_score_by_frcstr$num_total > cmu_forecasts * 0.80,]$forecaster
# CovidAnalytics-DELPHI and IHME-CurveFit didn't submit that many forecasts ...
score_cards_county = score_cards_county %>%
filter(forecaster %in% active_forecasters)
# Distinct locations present after filtering
# NOTE(review): computed but not referenced later in this file — confirm
# intended use
distinct_averagers <- score_cards_county %>%
select(geo_value) %>%
distinct()
First we make dot plots (not the same as that in evalcast): one dot per ahead, forecaster, and forecast date. The red plus marks the score computed over all dates. Here we use the mean as the aggregator function, and we study WIS, AE, and coverage-80.
# NA-tolerant aggregators: scores can be missing for some
# forecaster/location/date combinations, so always drop NAs.
Mean = function(x) {
  mean(x, na.rm = TRUE)
}
Median = function(x) {
  median(x, na.rm = TRUE)
}
# Aggregate a score column two ways over a common set of locations:
#   (1) per (forecaster, ahead, target_end_date), and
#   (2) overall per (forecaster, ahead), plus a plotting order per ahead.
#
# df:   score card with columns forecaster, ahead, target_end_date,
#       geo_value, and the score column named by `var`
# var:  string naming the score column to aggregate (e.g. "wis")
# aggr: aggregator applied to the score vector (default Mean)
#
# Returns the full join of (1) and (2) by forecaster and ahead: `var` holds
# the per-date aggregate, `var_overall` the all-dates aggregate, and `order`
# ranks forecasters by `var_overall` within each ahead (used as the x
# position in dot_plot).
summarize_var = function(df, var, aggr = Mean) {
  # Only keep locations reported by every forecaster/ahead/date combination,
  # so aggregates are comparable across forecasters
  df <- evalcast::intersect_averagers(
    df, c("forecaster", "ahead", "target_end_date"), "geo_value")
  df_by_date = df %>%
    group_by(forecaster, ahead, target_end_date) %>%
    summarize(var = aggr(!!as.symbol(var))) %>%
    ungroup()
  df_overall = df %>%
    group_by(forecaster, ahead) %>%
    summarize(var_overall = aggr(!!as.symbol(var))) %>%
    ungroup() %>% group_by(ahead) %>%
    arrange(var_overall, .by_group = TRUE) %>%
    ungroup() %>%
    mutate(order = row_number())
  # Return the join directly: the original assigned it to a local, which
  # returns the value invisibly and is unidiomatic as a final expression
  full_join(df_by_date, df_overall, by = c("forecaster", "ahead"))
}
# Dot plot of score `var`: one dot per (forecaster, ahead, target date),
# faceted by ahead, with a red plus marking the score aggregated over all
# dates. Forecasters are ordered on the x-axis by their overall score.
# ylim is applied via coord_cartesian so no data are dropped, only zoomed.
dot_plot = function(df, var = "wis", ylab = var, ylim = NULL, aggr = Mean) {
df_sum = summarize_var(df, var, aggr)
df_sum$ahead = factor(paste("ahead =", df_sum$ahead))
# Note: summarize_var names the aggregated column literally "var"
ggplot(df_sum, aes(x = order, y = var)) +
geom_point(aes(color = target_end_date)) +
# Red plus (shape 3): the all-dates aggregate for each forecaster
geom_point(aes(x = order, y = var_overall), color = "red", shape = 3) +
facet_wrap(vars(ahead), scales = "free") +
labs(x = "Forecaster", y = ylab) +
scale_x_continuous(breaks = df_sum$order, labels = df_sum$forecaster) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8)) +
coord_cartesian(ylim = ylim)
}
Now we make line plots: one line per ahead and forecaster, as a function of forecast date. Here we use the mean as the aggregator function, and we look at WIS and coverage-80.
# From https://stackoverflow.com/questions/15282580/
# Return n distinct colors by concatenating all qualitative ColorBrewer
# palettes.
# Bug fix: the original accepted `n` but ignored it, returning every
# available color; callers then relied on scale_color_manual() silently
# using only the first few. Now the first n colors are returned, with an
# explicit error when n exceeds the supply.
color_picker = function(n) {
  qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
  all_cols = unlist(mapply(brewer.pal, qual_col_pals$maxcolors,
                           rownames(qual_col_pals)))
  if (n > length(all_cols)) {
    stop("Requested ", n, " colors but only ", length(all_cols),
         " are available.")
  }
  all_cols[seq_len(n)]
}
# Line plot of score `var` over target date: one line per forecaster,
# faceted by ahead. ylim is applied via coord_cartesian so no data are
# dropped, only zoomed.
line_plot = function(df, var = "wis", ylab = var, ylim = NULL, aggr = Mean) {
  df_sum = summarize_var(df, var, aggr)
  df_sum$ahead = factor(paste("ahead =", df_sum$ahead))
  # Note: summarize_var names the aggregated column literally "var"
  ggplot(df_sum, aes(x = target_end_date, y = var)) +
    geom_line(aes(color = forecaster)) +
    # NOTE(review): with more than 6 forecasters the default shape scale
    # drops levels with a warning; consider a scale_shape_manual as in the
    # interval-width plot below
    geom_point(aes(color = forecaster, shape = forecaster)) +
    facet_wrap(vars(ahead), scales = "free") +
    labs(x = "Date", y = ylab) +
    theme_bw() +
    coord_cartesian(ylim = ylim) +
    # Bug fix: `forecaster` was referenced bare here, but no such variable
    # exists in the function's scope (scale_color_manual evaluates its
    # `values` argument immediately, not in the data); count the levels
    # from df_sum instead
    scale_color_manual(values = color_picker(length(unique(df_sum$forecaster))))
}
# Mean WIS over time (log y-scale), coverage of the 80% interval against the
# nominal 0.8 reference line, and mean absolute error
line_plot(score_cards_county, var = "wis", ylab = "Mean WIS") + scale_y_log10()
line_plot(score_cards_county, var = "cov_80", ylab = "Coverage-80", ylim = c(0,1)) +
geom_hline(yintercept = 0.8)
line_plot(score_cards_county, var = "ae", ylab="absolute error")
# Interval widths per forecaster/ahead/date over common locations
# (the 0.5 presumably selects the median width — confirm against
# evalcast::compute_width; the plot below labels it "Median width")
itval_widths <- evalcast::compute_width(
score_cards_county, 0.5,
c("forecaster","ahead","target_end_date"), "geo_value")
# Return n plotting symbols (pch codes), recycling a fixed palette of
# visually distinct shapes whenever n exceeds the palette size.
shape_picker <- function(n) {
  base_shapes <- c(16, 17, 15, 3, 7, 8, 10, 0, 2)
  rep_len(base_shapes, n)
}
# Number of forecasters, used to size the shape and color palettes
nvals <- length(unique(itval_widths$forecaster))
# Median width of the 80% prediction interval over time, per forecaster
itval_widths %>%
# Floating-point-tolerant match of the nominal 80% interval
filter(abs(nominal_prob - .8) < 1e-8) %>%
ggplot(aes(x=target_end_date, y=width)) +
geom_line(aes(color = forecaster)) +
geom_point(aes(color = forecaster, shape = forecaster)) +
facet_wrap(~ahead, scales = "free") +
labs(x = "Date", y = "Median width of 80% CI") +
theme_bw() +
scale_y_log10() +
scale_shape_manual(values = shape_picker(nvals)) +
scale_color_manual(values = color_picker(nvals))
We scale each score, per location and forecast date, by the COVIDhub-baseline score; then we take the mean or median.
Important note on the order of operations here: scale then aggregate. The other way around: aggregate then scale, would be a simple post-adjustment applied to the metrics we computed earlier. This way: scale then aggregate, results in a different final metric altogether. It is potentially interesting as it provides a nonparametric spatiotemporal adjustment; assuming that space and time effects are multiplicative, we’re directly “canceling them out” by taking ratios.
Here are dot plots for scaled WIS, with mean as the aggregator. Omitting median for brevity.
# Note to self: mutate_at() gave me a weird bug below! From now on, better use
# mutate() with across() instead ...
# Scale score `var`, per (location, target date, ahead), by the base
# forecaster's score, then drop the base forecaster's own (all-ones) rows.
# Returns one row per (geo_value, target_end_date, ahead, forecaster) with
# the ratio in a column named `scaled`.
scale_df = function(df, var, base_forecaster = "COVIDhub-baseline") {
df %>% select(-c(quantile, value, forecast_date)) %>%
distinct() %>%
# Keep only locations common to every forecaster/ahead/date combination,
# so each ratio is well-defined (df is piped in as the first argument)
evalcast::intersect_averagers(
c("forecaster","ahead","target_end_date"), "geo_value") %>%
# added by djm
# Widen to one column per forecaster, named e.g. "wisCMU-TimeSeries"
pivot_wider(id_cols = c(geo_value, target_end_date, ahead),
names_from = "forecaster", names_prefix = var,
values_from = var) %>%
# Divide every forecaster's column by the base forecaster's column
mutate(across(starts_with(var), ~ .x /
!!as.symbol(paste0(var, base_forecaster)))) %>%
pivot_longer(cols = starts_with(var), names_to = "forecaster",
values_to = "scaled") %>%
# Strip the score-name prefix to recover the plain forecaster name
mutate(forecaster = substring(forecaster, nchar(var) + 1)) %>%
filter(forecaster != base_forecaster)
}
# Mean WIS scaled by the baseline, over time; 1 marks baseline performance
line_plot(scale_df(score_cards_county, var = "wis"),
var = "scaled", ylab = "Mean scaled WIS") +
geom_hline(yintercept = 1)
Similar to what we did previously but just with centering instead of scaling.
Note on order of operations: center then aggregate versus aggregate then center are still in general different strategies. As before we’re adhering to the first way, with a similar motivation: if space and time effects were now additive, then this way would “cancel them out” directly. However, when the aggregator is a linear operation (e.g., mean), the two strategies essentially reduce to the same thing (“essentially”, not exactly, because setting na.rm = TRUE generally turns any linear operator into a nonlinear one).
Here are the dot plots for mean centered WIS. Omitting median for brevity.
# Center score `var` by the base forecaster's score, per location/date/ahead.
# Implemented by reusing scale_df() on the exponentiated score:
# log(exp(x) / exp(b)) = x - b, so the log of the scaled value is exactly
# the centered value.
# NOTE(review): exp() overflows to Inf for inputs above ~709, giving
# Inf/Inf = NaN in the ratio; if scores that large occur, this could be the
# "problem" mentioned in the note below — TODO confirm, and consider
# centering by direct subtraction instead.
center_df = function(df, var, base_forecaster = "COVIDhub-baseline") {
scale_df(df %>% mutate(y = exp(!!as.symbol(var))), "y", base_forecaster) %>%
mutate(centered = log(scaled)) %>% select(-scaled)
}
DJM: There’s some sort of problem here that causes an issue. I’m not sure what it is.
# NOTE(review): center_df exponentiates the score, and exp() overflows to
# Inf above ~709 — a plausible cause of the problem DJM mentions; TODO verify
dot_plot(center_df(score_cards_county, var = "wis"), var = "centered",
ylab = "Mean centered WIS") + geom_hline(yintercept = 0)
Here are now the line plots for mean centered WIS.
# Mean centered WIS over time; 0 marks baseline performance
line_plot(center_df(score_cards_county, var = "wis"), var = "centered",
ylab = "Mean centered WIS") + geom_hline(yintercept = 0)
We run a pairwise tournament. This is inspired by Johannes Bracher’s analysis (and similar ideas in the literature). Except, the order of operations here is different: scale then aggregate (whereas Johannes did: aggregate then scale). The motivation for this was explained above (thinking of it as providing a nonparametric spatiotemporal adjustment), as was the fact that the order of operations really makes a difference.
For each pair of forecasters \(f\) and \(g\), we compute:
\[ \theta_{fg} = A\bigg\{ \frac{S(f;\ell,d,a)}{S(g;\ell,d,a)} \;:\; \text{common locations $\ell$, forecast dates $d$, and ahead values $a$} \bigg\} \]
where \(S\) is a score of interest, say WIS, and \(A\) is an aggregator of interest, say the mean. Important note: we aggregate over all common locations, dates, and ahead values, which may differ for each pair \(f,g\). To compute an overall metric for forecaster \(f\), we use:
\[ \theta_f = \bigg( \prod_g \theta_{fg} \bigg)^{1/F}. \]
the geometric mean of all pairwise comparisons of \(f\) to other forecasters (here \(F\) is the total number of forecasters). Another interesting option would be to define \((\theta_f)_{f \in F}\) as the top left singular vector of the matrix \((\theta_{fg})_{f,g \in F}\), which we’ll also investigate.
# Pairwise tournament: for each base forecaster f, aggregate the ratio of
# every other forecaster g's score to f's score over their common
# locations/dates/aheads (via scale_df). Returns a list:
#   mat  - matrix of pairwise values theta[g, f] (NA on the diagonal)
#   df   - the same matrix in long form, for ggplot
#   vec1 - overall metric per forecaster: geometric mean across its row
#   vec2 - overall metric: top left singular vector of the matrix
pairwise_tournament = function(df, var, aggr = Mean) {
forecasters = unique(df$forecaster)
theta_mat = matrix(NA, length(forecasters), length(forecasters))
rownames(theta_mat) = colnames(theta_mat) = forecasters
for (f in forecasters) {
# scale_df drops the base forecaster f itself, so the diagonal stays NA
result = scale_df(df, var, base_forecaster = f) %>%
group_by(forecaster) %>%
summarize(v = aggr(scaled))
theta_mat[result$forecaster, f] = result$v
}
# Convert to data frame for convenience with ggplot
theta_df = as.data.frame(theta_mat) %>%
mutate(Forecaster1 = forecasters) %>%
pivot_longer(cols = -Forecaster1, names_to = "Forecaster2",
values_to = "value")
# Compute overall metrics two ways: geometric mean, SVD
theta_vec1 = exp(rowMeans(log(theta_mat), na.rm = TRUE))
diag(theta_mat) = 1 # so the SVD won't fail; undo it later
# NOTE(review): singular vectors are only defined up to sign, so vec2 may
# come back negated — confirm orientation before ranking by it
theta_vec2 = as.numeric(svd(theta_mat, nu = 1)$u)
names(theta_vec2) = names(theta_vec1)
diag(theta_mat) = NA
return(list(mat = theta_mat, df = theta_df, vec1 = theta_vec1,
vec2 = theta_vec2))
}
# Tournament with a lightly trimmed mean (1% per tail) to guard against
# extreme score ratios
theta_county = pairwise_tournament(score_cards_county, var = "wis", aggr =
function(x) mean(x, trim = 0.01, na.rm = TRUE))
# Rank forecasters by the geometric-mean overall metric (lower is better)
ranked_list = rownames(theta_county$mat)[order(theta_county$vec1)]
colors = colorRampPalette(brewer.pal(n = 6, name = "RdBu"))(30)
# Heatmap of pairwise ratios, rows/columns ordered by overall rank
ggplot(theta_county$df, aes(x = factor(Forecaster2, levels = rev(ranked_list)),
y = factor(Forecaster1, levels = rev(ranked_list)))) +
geom_tile(aes(fill = value)) +
geom_text(aes(label = round(value, 3))) +
scale_fill_gradientn(colours = colors) +
labs(x = NULL, y = NULL) +
theme_bw() + theme(legend.position = "none",
axis.text.x = element_text(angle = 90, hjust = 1))
# Overall metric (computed via GM of pairwise metrics):
knitr::kable(data.frame(rank = 1:length(theta_county$vec1), forecaster = ranked_list,
theta_county = sort(theta_county$vec1), row.names = NULL))
| rank | forecaster | theta_county |
|---|---|---|
| 1 | CMU-TimeSeries | 0.8969041 |
| 2 | COVIDhub-ensemble | 0.8981448 |
| 3 | COVIDhub-baseline | 1.0962243 |
| 4 | LANL-GrowthRate | 1.1651917 |
| 5 | UMass-MechBayes | 1.2631557 |
| 6 | UCLA-SuEIR | 1.8866113 |
| 7 | JHU_IDD-CovidSP | 2.3600358 |
For curiosity, we can plot the agreement between the overall metrics computed via GM and SVD. The agreement is basically perfect!
# Scatter of the GM-based vs SVD-based overall metrics per forecaster
ggplot() + geom_point(aes(x = theta_county$vec1, y = theta_county$vec2)) +
labs(x = "Geometric mean", y = "Top left singular vector")
Repeat the same pairwise tournament but with the median as the aggregator.
# Same tournament, now with the median as the aggregator
theta_county = pairwise_tournament(score_cards_county, var = "wis", aggr = Median)
ranked_list = rownames(theta_county$mat)[order(theta_county$vec1)]
colors = colorRampPalette(brewer.pal(n = 6, name = "RdBu"))(30)
# Heatmap of pairwise ratios, rows/columns ordered by overall rank
ggplot(theta_county$df, aes(x = factor(Forecaster2, levels = rev(ranked_list)),
y = factor(Forecaster1, levels = rev(ranked_list)))) +
geom_tile(aes(fill = value)) +
geom_text(aes(label = round(value, 3))) +
scale_fill_gradientn(colours = colors) +
labs(x = NULL, y = NULL) +
theme_bw() + theme(legend.position = "none",
axis.text.x = element_text(angle = 90, hjust = 1))
# Overall metric (computed via GM of pairwise metrics):
knitr::kable(data.frame(rank = 1:length(theta_county$vec1), forecaster = ranked_list,
theta_county = sort(theta_county$vec1), row.names = NULL))
| rank | forecaster | theta_county |
|---|---|---|
| 1 | CMU-TimeSeries | 0.6850103 |
| 2 | COVIDhub-ensemble | 0.7731633 |
| 3 | COVIDhub-baseline | 0.8439016 |
| 4 | UMass-MechBayes | 0.9263274 |
| 5 | LANL-GrowthRate | 0.9323508 |
| 6 | UCLA-SuEIR | 1.4306299 |
| 7 | JHU_IDD-CovidSP | 1.8107928 |
And now with the 90th percentile as the aggregator.
# Same tournament, now with the 90th percentile as the aggregator
theta_county = pairwise_tournament(score_cards_county, var = "wis", aggr =
function(x) quantile(x, prob = 0.9, na.rm = TRUE))
ranked_list = rownames(theta_county$mat)[order(theta_county$vec1)]
colors = colorRampPalette(brewer.pal(n = 6, name = "RdBu"))(30)
# Heatmap of pairwise ratios, rows/columns ordered by overall rank
ggplot(theta_county$df, aes(x = factor(Forecaster2, levels = rev(ranked_list)),
y = factor(Forecaster1, levels = rev(ranked_list)))) +
geom_tile(aes(fill = value)) +
geom_text(aes(label = round(value, 3))) +
scale_fill_gradientn(colours = colors) +
labs(x = NULL, y = NULL) +
theme_bw() + theme(legend.position = "none",
axis.text.x = element_text(angle = 90, hjust = 1))
# Overall metric (computed via GM of pairwise metrics):
knitr::kable(data.frame(rank = 1:length(theta_county$vec1), forecaster = ranked_list,
theta_county = sort(theta_county$vec1), row.names = NULL))
| rank | forecaster | theta_county |
|---|---|---|
| 1 | COVIDhub-ensemble | 1.611965 |
| 2 | CMU-TimeSeries | 1.791145 |
| 3 | COVIDhub-baseline | 2.186873 |
| 4 | LANL-GrowthRate | 2.261593 |
| 5 | UMass-MechBayes | 2.576636 |
| 6 | UCLA-SuEIR | 3.947735 |
| 7 | JHU_IDD-CovidSP | 4.771489 |
# Case study: Los Angeles County, CA (FIPS 06037). Compare CMU-TimeSeries
# and COVIDhub-ensemble forecast trajectories against observed weekly cases.
fips = "06037"
pcards_cmu = filter(predictions_cards, forecaster == "CMU-TimeSeries")
pcards_ens = predictions_cards %>%
filter(forecaster == "COVIDhub-ensemble")
# Observed incident cases from USAFacts, then summed to epiweeks
la_actual = covidcast_signal("usa-facts",
"confirmed_incidence_num","2020-07-01",
geo_values = fips)
la_epi = evalcast:::sum_to_epiweek(la_actual) %>%
rename(target_end_date = time_value)
# Side-by-side trajectory plots with 80% prediction intervals
p1 = filter(pcards_cmu, geo_value == fips) %>%
plot_trajectory(intervals = 0.8, side_truth = la_epi, plot_it = FALSE) +
ggtitle("CMU-TimeSeries") + theme(legend.position = "none")
p2 = filter(pcards_ens, geo_value == fips) %>%
plot_trajectory(intervals = 0.8, side_truth = la_epi, plot_it = FALSE) +
ggtitle("COVIDhub-ensemble") +
theme(legend.position = "none")
cowplot::plot_grid(p1, p2)
# Case study: Allegheny County, PA / Pittsburgh (FIPS 42003); same layout
# as the Los Angeles chunk above
fips = "42003"
pgh_actual = covidcast_signal("usa-facts",
"confirmed_incidence_num","2020-07-01",
geo_values = fips)
pgh_epi = evalcast:::sum_to_epiweek(pgh_actual) %>%
rename(target_end_date = time_value)
p1 = filter(pcards_cmu, geo_value == fips) %>%
plot_trajectory(intervals = 0.8, side_truth = pgh_epi, plot_it = FALSE) +
ggtitle("CMU-TimeSeries") + theme(legend.position = "none")
p2 = filter(pcards_ens, geo_value == fips) %>%
plot_trajectory(intervals = 0.8, side_truth = pgh_epi, plot_it = FALSE) +
ggtitle("COVIDhub-ensemble") +
theme(legend.position = "none")
cowplot::plot_grid(p1, p2)
# Case study: Dane County, WI / Madison (FIPS 55025)
# NOTE(review): this chunk repeats the pattern of the two above; consider
# factoring into a helper taking (fips, label)
fips = "55025"
mad_actual = covidcast_signal("usa-facts",
"confirmed_incidence_num","2020-07-01",
geo_values = fips)
mad_epi = evalcast:::sum_to_epiweek(mad_actual) %>%
rename(target_end_date = time_value)
p1 = filter(pcards_cmu, geo_value == fips) %>%
plot_trajectory(intervals = 0.8, side_truth = mad_epi, plot_it = FALSE) +
ggtitle("CMU-TimeSeries") + theme(legend.position = "none")
p2 = filter(pcards_ens, geo_value == fips) %>%
plot_trajectory(intervals = 0.8, side_truth = mad_epi, plot_it = FALSE) +
ggtitle("COVIDhub-ensemble") +
theme(legend.position = "none")
cowplot::plot_grid(p1, p2)